# import functions
import pandas as pd

# Read the enriched dataset; parse the timestamp column as datetimes.
data_enriched = pd.read_csv("data_enriched.csv", parse_dates=['Zeitstempel'])

# Show the time span covered by the data.
print(data_enriched['Zeitstempel'].min())
print(data_enriched['Zeitstempel'].max())

# Drop identifier/timestamp columns that must not be used as model features.
data_enriched = data_enriched.drop(columns=['ArtikelNr', 'Zeitstempel', 'Menge_log'])
# Output: 2018-01-02 00:00:00 2022-02-22 00:00:00
# All articles, ordered from most to least frequent.
all_articles = list(data_enriched['Artikel'].value_counts().index)
all_articles
# Output: ['Laugen-Gipfel of', 'Butter-Gipfel / Croissant of', 'caffè crema', 'St. Galler Handbürli Culinarium *Gold prämiert*', 'Körnergipfel of', 'Weggli', 'Mais-Gipfel of', 'Sonnenblumenbrötli of', 'St. Galler Handb dk. Culinarium *Gold prämiert*', 'Semmel of', 'Nuss-Stengel Original', 'Pain Roule rustico klein of', 'Gallusbrot 400 of', 'Mais-Brötli süss of', 'Berliner m Confi Himbeer of', 'Erdbeertörtli gross ', 'Butterzopf 440 2-teilig ', 'Ziger-Krapfen of']
# Funktionen importieren
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
# Function to calculate the score
def calculate_score(result):
    """Combine R2, MAPE and RMSE from *result* into a single model score.

    Higher is better: the score rewards high R2 and low MAPE/RMSE,
    weighting the combined term 0.6 and the two partial terms 0.2 each.
    Expects a dict with keys 'R2', 'RMSE' (non-zero) and 'MAPE'.
    """
    r2 = result['R2']
    accuracy = 1 - result['MAPE']
    inv_rmse = 1 / result['RMSE']
    return (0.6 * r2 * accuracy * inv_rmse) + (0.2 * accuracy * inv_rmse) + (0.2 * r2)
# Best model per article, keyed by article name.
best_models = {}
# DataFrame collecting the metrics of every fitted model.
all_results = pd.DataFrame()

# Hyperparameter search space for the random forest.
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 50],
    'min_samples_split': [2, 10],
    'min_samples_leaf': [1, 4],
    'bootstrap': [True, False],
}

# Hyperparameter search space for XGBoost.
xgb_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 50],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.5, 1.0],
    'colsample_bytree': [0.4, 1.0],
    'min_child_weight': [1, 4],
}
# Loop over articles: per article, tune one model per algorithm and keep the
# one with the lowest test-set RMSE in `best_models`; every fit's metrics are
# appended to `all_results`.
for artikel in all_articles:
    # Restrict to this article and separate features from the target 'Menge'.
    data_filtered = data_enriched[data_enriched['Artikel'] == artikel].drop(['Artikel'], axis=1)
    X = data_filtered.drop(['Menge'], axis=1)
    y = data_filtered['Menge']

    # Split data into train and test set (fixed seed for reproducibility).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=4153)

    # Candidate models; estimators are seeded so repeated runs are identical
    # (the split and the search were already seeded, the models were not).
    models = [
        ('LinearRegression', LinearRegression(), {}),
        ('RandomForestRegressor', RandomForestRegressor(random_state=4153), rf_params),
        ('XGBRegressor', XGBRegressor(random_state=4153), xgb_params),
    ]

    # Track the best RMSE and the corresponding fitted model.
    best_rmse = float('inf')
    best_model_info = None

    for name, model, params in models:
        if params:
            # Hyperparameter tuning using RandomizedSearchCV (5-fold CV);
            # n_jobs=-1 parallelizes the candidate fits.
            model_cv = RandomizedSearchCV(model, params, cv=5, n_iter=10,
                                          random_state=4153, n_jobs=-1)
            model_cv.fit(X_train, y_train)
            best_model = model_cv.best_estimator_
        else:
            best_model = model
            best_model.fit(X_train, y_train)

        # Evaluate on the held-out test set.
        y_pred = best_model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        # NOTE: mean_squared_error(..., squared=False) was deprecated in
        # scikit-learn 1.4 and removed in 1.6 — take the root explicitly.
        rmse = mean_squared_error(y_test, y_pred) ** 0.5
        mape = mean_absolute_percentage_error(y_test, y_pred)

        # Collect metrics plus the combined score for this fit.
        result = {'Artikel': artikel, 'R2': r2, 'RMSE': rmse, 'MAPE': mape, 'modelname': name}
        result['score'] = calculate_score(result)
        all_results = pd.concat([all_results, pd.DataFrame([result])], ignore_index=True)

        # Keep the model with the lowest test RMSE seen so far.
        if rmse < best_rmse:
            best_rmse = rmse
            best_model_info = {'model': best_model, 'metrics': result}

    # Save the best model for this artikel
    best_models[artikel] = best_model_info
import pickle

# Persist the per-article best models for later reuse.
with open('best_models.pkl', 'wb') as fh:
    pickle.dump(best_models, fh)
import matplotlib.pyplot as plt
import seaborn as sns


def _plot_metric_by_artikel(results, metric, title, ylabel):
    """Grouped bar chart of *metric* per Artikel, one bar colour per model."""
    plt.figure(figsize=(10, 5))
    sns.barplot(data=results, x='Artikel', y=metric, hue='modelname')
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Artikel')
    # Article names are long — rotate so they stay readable.
    plt.xticks(rotation=90)
    plt.legend(title='Model')
    plt.show()


# One chart per metric; previously three copy-pasted cells.
_plot_metric_by_artikel(all_results, 'score', 'Scores for each Artikel', 'Score')
_plot_metric_by_artikel(all_results, 'R2', 'R2 for each Artikel', 'R2')
_plot_metric_by_artikel(all_results, 'RMSE', 'RMSE for each Artikel', 'RMSE')
from functions import plot_variable_importance

# NOTE(review): X_train is whatever remained from the last loop iteration
# above; all articles share the same feature columns after dropping
# 'Artikel', but passing the matching article's data would be cleaner — confirm.
for artikel, info in best_models.items():
    plot_variable_importance(info['model'], X_train, info['metrics']['modelname'], artikel)
from sklearn.ensemble import VotingRegressor

# Best model (including the VotingRegressor) per article.
best_models_voting = {}
# DataFrame collecting the metrics of every fitted model.
all_results_voting = pd.DataFrame()

# Loop over articles: tune one model per algorithm, then additionally fit a
# VotingRegressor over the tuned base models; keep whichever has the lowest
# test-set RMSE.
for artikel in all_articles:
    # Restrict to this article and separate features from the target 'Menge'.
    data_filtered = data_enriched[data_enriched['Artikel'] == artikel].drop(['Artikel'], axis=1)
    X = data_filtered.drop(['Menge'], axis=1)
    y = data_filtered['Menge']

    # Split data into train and test set (fixed seed for reproducibility).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=4153)

    # Candidate models; estimators seeded so repeated runs are identical.
    models = [
        ('LinearRegression', LinearRegression(), {}),
        ('RandomForestRegressor', RandomForestRegressor(random_state=4153), rf_params),
        ('XGBRegressor', XGBRegressor(random_state=4153), xgb_params),
    ]

    # Track the best RMSE and the corresponding fitted model.
    best_rmse = float('inf')
    best_model_info = None
    # Base estimators for the VotingRegressor (the tuned models).
    estimators = []

    for name, model, params in models:
        if params:
            # Hyperparameter tuning using RandomizedSearchCV (5-fold CV).
            model_cv = RandomizedSearchCV(model, params, cv=5, n_iter=10,
                                          random_state=4153, n_jobs=-1)
            model_cv.fit(X_train, y_train)
            best_model = model_cv.best_estimator_
        else:
            best_model = model
            best_model.fit(X_train, y_train)

        # Add the tuned model to the ensemble's base estimators.
        estimators.append((name, best_model))

        # Evaluate on the held-out test set.
        y_pred = best_model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        # NOTE: mean_squared_error(..., squared=False) was removed in
        # scikit-learn 1.6 — take the root explicitly.
        rmse = mean_squared_error(y_test, y_pred) ** 0.5
        mape = mean_absolute_percentage_error(y_test, y_pred)

        # Collect metrics plus the combined score for this fit.
        result = {'Artikel': artikel, 'R2': r2, 'RMSE': rmse, 'MAPE': mape, 'modelname': name}
        result['score'] = calculate_score(result)
        all_results_voting = pd.concat([all_results_voting, pd.DataFrame([result])], ignore_index=True)

        # Keep the model with the lowest test RMSE seen so far.
        if rmse < best_rmse:
            best_rmse = rmse
            best_model_info = {'model': best_model, 'metrics': result}

    # Ensemble of the tuned base models, averaged predictions.
    voting_regressor = VotingRegressor(estimators)
    voting_regressor.fit(X_train, y_train)

    # Evaluate the ensemble on the same test set.
    y_pred = voting_regressor.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    mape = mean_absolute_percentage_error(y_test, y_pred)

    result = {'Artikel': artikel, 'R2': r2, 'RMSE': rmse, 'MAPE': mape, 'modelname': 'VotingRegressor'}
    result['score'] = calculate_score(result)
    all_results_voting = pd.concat([all_results_voting, pd.DataFrame([result])], ignore_index=True)

    # The ensemble may beat every base model — compare it too.
    if rmse < best_rmse:
        best_rmse = rmse
        best_model_info = {'model': voting_regressor, 'metrics': result}

    # Save the best model and metrics for this artikel
    best_models_voting[artikel] = best_model_info
import pickle

# Persist the per-article best models (incl. voting ensembles) for reuse.
with open('best_models_voting.pkl', 'wb') as fh:
    pickle.dump(best_models_voting, fh)
import matplotlib.pyplot as plt
import seaborn as sns


def _plot_voting_metric_by_artikel(results, metric, title, ylabel):
    """Grouped bar chart of *metric* per Artikel, one bar colour per model."""
    plt.figure(figsize=(10, 5))
    sns.barplot(data=results, x='Artikel', y=metric, hue='modelname')
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Artikel')
    # Article names are long — rotate so they stay readable.
    plt.xticks(rotation=90)
    plt.legend(title='Model')
    plt.show()


# One chart per metric; previously three copy-pasted cells.
# Bug fix: the score chart's y-axis was mislabelled 'R2'.
_plot_voting_metric_by_artikel(all_results_voting, 'score', 'Scores for each Artikel', 'Score')
_plot_voting_metric_by_artikel(all_results_voting, 'R2', 'R2 for each Artikel', 'R2')
_plot_voting_metric_by_artikel(all_results_voting, 'RMSE', 'RMSE for each Artikel', 'RMSE')
from functions import plot_variable_importance

# NOTE(review): X_train is whatever remained from the last loop iteration
# above — confirm all articles share the same feature columns. The printed
# output below suggests plot_variable_importance cannot extract importances
# from a fitted VotingRegressor.
for artikel, info in best_models_voting.items():
    plot_variable_importance(info['model'], X_train, info['metrics']['modelname'], artikel)
# Output: No feature importances or coefficients available for model Estimator 0
# Output: No feature importances or coefficients available for model Estimator 0
# Output: No feature importances or coefficients available for model Estimator 0